library(tidyverse)
library(ggdendro)
library(here)
library(plotly)
library(cluster)
library(ggfortify)
library(broom)
source(here::here("code/plota_solucoes_hclust.R"))
games = read_csv("../data/international-football.csv")
Parsed with column specification:
cols(
  date = col_date(format = ""),
  home_team = col_character(),
  away_team = col_character(),
  home_score = col_integer(),
  away_score = col_integer(),
  tournament = col_character(),
  city = col_character(),
  country = col_character(),
  neutral = col_logical()
)
jogos_brasil = games %>% 
    filter(home_team == "Brazil" | away_team == "Brazil") %>% 
    mutate(time1 = "Brazil", 
           time2 = if_else(home_team == "Brazil", away_team, home_team), 
           score1 = if_else(home_team == "Brazil", home_score, away_score),
           score2 = if_else(home_team == "Brazil", away_score, home_score) 
    ) 
historicos = jogos_brasil %>% 
    group_by(time2) %>% 
    summarise(
        jogos = n(),
        ganhou = sum(score1 > score2) / n(), 
        empatou = sum(score1 == score2) / n(),
        perdeu = sum(score1 < score2) / n()
        )
p <- historicos %>%
  filter(jogos > 2) %>%
  ggplot(aes(x = ganhou, 
             y = jogos, 
             text = paste("Seleção:", time2,
                          "\nVitorio:",ganhou,
                          "\nPerdeu:",perdeu,
                          "\nEmpatou:",empatou))) + 
  geom_point(size = 4,
             color = "#938BA1") +
  labs(y = "Quantidade de Jogos", 
       x = "Proporção de Vitorias.")
ggplotly(p, tooltip = "text") %>%
    layout(autosize = F)
We recommend that you use the dev version of ggplot2 with `ggplotly()`
Install it with: `devtools::install_github('hadley/ggplot2')`
agrupamento_h = historicos %>% 
    filter(jogos > 2) %>%
    as.data.frame() %>% 
    column_to_rownames("time2") %>% 
    select(ganhou) %>%
    dist(method = "euclidian") %>% 
    hclust(method = "ward.D")
ggdendrogram(agrupamento_h, rotate = T, size = 2, theme_dendro = F) + 
    labs(y = "Dissimilaridade", x = "", title = "Dendrograma")

Aplicando o K-means

Com o intuito de busca grupos onde se encaixa os adversarios da seleção Brasileira podemos observar utilizando o algoritimo k-means.

*O algoritmo do K-Means pode ser descrito da seguinte maneira: +1: Escolher k distintos valores para centros dos grupos (possivelmente, de forma aleatória) +2: Associar cada ponto ao centro mais próximo +3: Recalcular o centro de cada grupo +4: Repetir os passos 2-3 até nenhum elemento mudar de grupo.

historico_t = historicos %>% 
    filter(jogos > 2) %>% 
    mutate(jogos = log10(jogos)) 
atribuicoes = tibble(k = 1:6) %>% 
    group_by(k) %>% 
    do(kmeans(select(historico_t,ganhou, jogos), 
              centers = .$k, 
              nstart = 10) %>% augment(historico_t)) # alterne entre filmes e filmes_t no augment  
Unequal factor levels: coercing to characterbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vector
atribuicoes_long = atribuicoes %>% 
    gather(key = "variavel", value = "valor", -time2, -k, -.cluster, -jogos) 
atribuicoes %>%
    ggplot(aes(x = ganhou, y = jogos, label = time2, colour = .cluster)) + 
    geom_point() + 
    #geom_text() + 
    facet_wrap(~ k) + scale_y_log10()

# A silhoueta
dists = select(historico_t, ganhou, jogos) %>% dist()
km = kmeans(select(historico_t, ganhou, jogos), 
            centers = 4, 
            nstart = 10) 
silhouette(km$cluster, dists) %>% 
    plot(col = RColorBrewer::brewer.pal(4, "Set2"))

set.seed(123)
explorando_k = tibble(k = 1:15) %>% 
    group_by(k) %>% 
    do(
        kmeans(select(historico_t, -time2), 
               centers = .$k, 
               nstart = 20) %>% glance()
    )
explorando_k %>% 
    ggplot(aes(x = k, y = betweenss / totss)) + 
    geom_line() + 
    geom_point()

d.scaled.km.long = km %>% 
    augment(historico_t) %>%
    gather(key = "variável", 
           value = "valor", 
           -time2, -.cluster)
d.scaled.km.long %>% 
    ggplot(aes(x=`variável`, y=valor, group=time2, colour=.cluster)) + 
    geom_line(alpha = .5) + 
    facet_wrap(~ .cluster) 

LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKCmBgYHtyfQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeShnZ2RlbmRybykKbGlicmFyeShoZXJlKQpsaWJyYXJ5KHBsb3RseSkKbGlicmFyeShjbHVzdGVyKQpsaWJyYXJ5KGdnZm9ydGlmeSkKbGlicmFyeShicm9vbSkKc291cmNlKGhlcmU6OmhlcmUoImNvZGUvcGxvdGFfc29sdWNvZXNfaGNsdXN0LlIiKSkKYGBgCgpgYGB7cn0KZ2FtZXMgPSByZWFkX2NzdigiLi4vZGF0YS9pbnRlcm5hdGlvbmFsLWZvb3RiYWxsLmNzdiIpCgpqb2dvc19jb3BhID0gZ2FtZXMgJT4lIAogICAgZmlsdGVyKGhvbWVfdGVhbSA9PSAiQnJhemlsIiB8IGF3YXlfdGVhbSA9PSAiQnJhemlsIikgJT4lIAogICAgbXV0YXRlKHRpbWUxID0gIkJyYXppbCIsIAogICAgICAgICAgIHRpbWUyID0gaWZfZWxzZShob21lX3RlYW0gPT0gIkJyYXppbCIsIGF3YXlfdGVhbSwgaG9tZV90ZWFtKSwgCiAgICAgICAgICAgc2NvcmUxID0gaWZfZWxzZShob21lX3RlYW0gPT0gIkJyYXppbCIsIGhvbWVfc2NvcmUsIGF3YXlfc2NvcmUpLAogICAgICAgICAgIHNjb3JlMiA9IGlmX2Vsc2UoaG9tZV90ZWFtID09ICJCcmF6aWwiLCBhd2F5X3Njb3JlLCBob21lX3Njb3JlKSAKICAgICkgCgpoaXN0b3JpY29zID0gam9nb3NfYnJhc2lsICU+JSAKICAgIGdyb3VwX2J5KHRpbWUyKSAlPiUgCiAgICBzdW1tYXJpc2UoCiAgICAgICAgam9nb3MgPSBuKCksCiAgICAgICAgZ2FuaG91ID0gc3VtKHNjb3JlMSA+IHNjb3JlMikgLyBuKCksIAogICAgICAgIGVtcGF0b3UgPSBzdW0oc2NvcmUxID09IHNjb3JlMikgLyBuKCksCiAgICAgICAgcGVyZGV1ID0gc3VtKHNjb3JlMSA8IHNjb3JlMikgLyBuKCkKICAgICAgICApCmBgYApgYGB7cn0KcCA8LSBoaXN0b3JpY29zICU+JQogIGZpbHRlcihqb2dvcyA+IDIpICU+JQogIGdncGxvdChhZXMoeCA9IGdhbmhvdSwgCiAgICAgICAgICAgICB5ID0gam9nb3MsIAogICAgICAgICAgICAgdGV4dCA9IHBhc3RlKCJTZWxlw6fDo286IiwgdGltZTIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgIlxuVml0b3JpbzoiLGdhbmhvdSwKICAgICAgICAgICAgICAgICAgICAgICAgICAiXG5QZXJkZXU6IixwZXJkZXUsCiAgICAgICAgICAgICAgICAgICAgICAgICAgIlxuRW1wYXRvdToiLGVtcGF0b3UpKSkgKyAKICBnZW9tX3BvaW50KHNpemUgPSA0LAogICAgICAgICAgICAgY29sb3IgPSAiIzkzOEJBMSIpICsKICBsYWJzKHkgPSAiUXVhbnRpZGFkZSBkZSBKb2dvcyIsIAogICAgICAgeCA9ICJQcm9wb3LDp8OjbyBkZSBWaXRvcmlhcy4iKQoKZ2dwbG90bHkocCwgdG9vbHRpcCA9ICJ0ZXh0IikgJT4lCiAgICBsYXlvdXQoYXV0b3NpemUgPSBGKQpgYGAKKiBQb2RlbW9zIG9ic2VydmFyIHF1ZSBvIEJyYXNpbCB0ZW5kZSBhIHRlciB1bSBudW1lcm8gbWFpb3IgZGUgdml0b3JpYXMuCiogT3MgdGltZXMgcXVlIG8gQnJhc2lsIG1haXMgam9nb3Ugc8OjbyB0aW1lcyBkYSBhbWVyaWNhIGxhdGluYSBwb2RlbW9zIGFzc29jaWFyIGlzdG8gYW9zIGNhbXBlb25hdG9zIGNsYXNzaWZpY2F0b3JpbyBwYXJhIGNvcGEgZG8gbXVuZG8uCiogUGFyYSBlc3TDoSBhbmFsaXNlIGNvbnNpZGVyYW1vcyBhcGVuYXMgc2VsZcOnw7VlcyBxdWUgbyBCcmFzaWwgam9nb3UgbWFpcyBkbyBxdWUgMiB2ZXplcy4KCmBgYHtyfQphZ3J1cGFtZW50b19oID0gaGlzdG9yaWNvcyAlPiUgCiAgICBmaWx0ZXIoam9nb3MgPiAyKSAlPiUKICAgIGFzLmRhdGEuZnJhbWUoKSAlPiUgCiAgICBjb2x1bW5fdG9fcm93bmFtZXMoInRpbWUyIikgJT4lIAogICAgc2VsZWN0KGdhbmhvdSkgJT4lCiAgICBkaXN0KG1ldGhvZCA9ICJldWNsaWRpYW4iKSAlPiUgCiAgICBoY2x1c3QobWV0aG9kID0gIndhcmQuRCIpCgpnZ2RlbmRyb2dyYW0oYWdydXBhbWVudG9faCwgcm90YXRlID0gVCwgc2l6ZSA9IDIsIHRoZW1lX2RlbmRybyA9IEYpICsgCiAgICBsYWJzKHkgPSAiRGlzc2ltaWxhcmlkYWRlIiwgeCA9ICIiLCB0aXRsZSA9ICJEZW5kcm9ncmFtYSIpCmBgYAoqIFBvZGVtb3Mgb2JzZXJ2YXIgZW0gdGVybW8gZGUgRGlzc2ltaWxhcmlkYWRlIGEgZGl2aXPDo28gZGUgZ3J1cG9zIGJlbSBwcm94aW1hcy4KCiMjIEFwbGljYW5kbyBvIEstbWVhbnMKQ29tIG8gaW50dWl0byBkZSBidXNjYSBncnVwb3Mgb25kZSBzZSBlbmNhaXhhIG9zIGFkdmVyc2FyaW9zIGRhIHNlbGXDp8OjbyBCcmFzaWxlaXJhIHBvZGVtb3Mgb2JzZXJ2YXIgdXRpbGl6YW5kbyBvIGFsZ29yaXRpbW8gay1tZWFucy4KCipPIGFsZ29yaXRtbyBkbyBLLU1lYW5zIHBvZGUgc2VyIGRlc2NyaXRvIGRhIHNlZ3VpbnRlIG1hbmVpcmE6CiAgKzE6IEVzY29saGVyIGsgZGlzdGludG9zIHZhbG9yZXMgcGFyYSBjZW50cm9zIGRvcyBncnVwb3MgKHBvc3NpdmVsbWVudGUsIGRlIGZvcm1hIGFsZWF0w7NyaWEpCiAgKzI6IEFzc29jaWFyIGNhZGEgcG9udG8gYW8gY2VudHJvIG1haXMgcHLDs3hpbW8KICArMzogUmVjYWxjdWxhciBvIGNlbnRybyBkZSBjYWRhIGdydXBvCiAgKzQ6IFJlcGV0aXIgb3MgcGFzc29zIDItMyBhdMOpIG5lbmh1bSBlbGVtZW50byBtdWRhciBkZSBncnVwby4KCgpgYGB7cn0KaGlzdG9yaWNvX3QgPSBoaXN0b3JpY29zICU+JSAKICAgIGZpbHRlcihqb2dvcyA+IDIpICU+JSAKICAgIG11dGF0ZShqb2dvcyA9IGxvZzEwKGpvZ29zKSkgCgphdHJpYnVpY29lcyA9IHRpYmJsZShrID0gMTo2KSAlPiUgCiAgICBncm91cF9ieShrKSAlPiUgCiAgICBkbyhrbWVhbnMoc2VsZWN0KGhpc3Rvcmljb190LGdhbmhvdSwgam9nb3MpLCAKICAgICAgICAgICAgICBjZW50ZXJzID0gLiRrLCAKICAgICAgICAgICAgICBuc3RhcnQgPSAxMCkgJT4lIGF1Z21lbnQoaGlzdG9yaWNvX3QpKSAjIGFsdGVybmUgZW50cmUgZmlsbWVzIGUgZmlsbWVzX3Qgbm8gYXVnbWVudCAgCgphdHJpYnVpY29lc19sb25nID0gYXRyaWJ1aWNvZXMgJT4lIAogICAgZ2F0aGVyKGtleSA9ICJ2YXJpYXZlbCIsIHZhbHVlID0gInZhbG9yIiwgLXRpbWUyLCAtaywgLS5jbHVzdGVyLCAtam9nb3MpIAoKYXRyaWJ1aWNvZXMgJT4lCiAgICBnZ3Bsb3QoYWVzKHggPSBnYW5ob3UsIHkgPSBqb2dvcywgbGFiZWwgPSB0aW1lMiwgY29sb3VyID0gLmNsdXN0ZXIpKSArIAogICAgZ2VvbV9wb2ludCgpICsgCiAgICAjZ2VvbV90ZXh0KCkgKyAKICAgIGZhY2V0X3dyYXAofiBrKSArIHNjYWxlX3lfbG9nMTAoKQoKIyBBIHNpbGhvdWV0YQpkaXN0cyA9IHNlbGVjdChoaXN0b3JpY29fdCwgZ2FuaG91LCBqb2dvcykgJT4lIGRpc3QoKQprbSA9IGttZWFucyhzZWxlY3QoaGlzdG9yaWNvX3QsIGdhbmhvdSwgam9nb3MpLCAKICAgICAgICAgICAgY2VudGVycyA9IDQsIAogICAgICAgICAgICBuc3RhcnQgPSAxMCkgCgpzaWxob3VldHRlKGttJGNsdXN0ZXIsIGRpc3RzKSAlPiUgCiAgICBwbG90KGNvbCA9IFJDb2xvckJyZXdlcjo6YnJld2VyLnBhbCg0LCAiU2V0MiIpKQpgYGAKCiogUG9kZW1vcyBvYnNlcnZhciBxdWUgYSBkaXZpc8OjbyBlbSA1IGdydXBvIGNvbW8gYSBpZGVhbC4KKiBDb25zaWRlcmFuZG8gbyB1c28gZG8gay1tZWFucyBwb2RlbW9zIHZlcmlmaWNhciBxdWFsIG8gbWVsaG9yIHZhbG9yIGRlIGsgZGUgYWNvcmRvIGNvbSBvIGdyw6FmaWNvIGFiYWl4by4KCmBgYHtyfQpzZXQuc2VlZCgxMjMpCmV4cGxvcmFuZG9fayA9IHRpYmJsZShrID0gMToxNSkgJT4lIAogICAgZ3JvdXBfYnkoaykgJT4lIAogICAgZG8oCiAgICAgICAga21lYW5zKHNlbGVjdChoaXN0b3JpY29fdCwgLXRpbWUyKSwgCiAgICAgICAgICAgICAgIGNlbnRlcnMgPSAuJGssIAogICAgICAgICAgICAgICBuc3RhcnQgPSAyMCkgJT4lIGdsYW5jZSgpCiAgICApCgpleHBsb3JhbmRvX2sgJT4lIAogICAgZ2dwbG90KGFlcyh4ID0gaywgeSA9IGJldHdlZW5zcyAvIHRvdHNzKSkgKyAKICAgIGdlb21fbGluZSgpICsgCiAgICBnZW9tX3BvaW50KCkKYGBgCgoqIFBvZGVtb3Mgb2JzZXJ2YXIgcXVlIGsgZXN0w6EgZW50cmUgb3MgdmFsb3JlcyA1IGUgNi4KKiBDb25zaWRlcmFtb3MgcXVlIMOhIHBhcnRpciBkbyB2YWxvciA2IG7Do28gb2NvcnJlIHVtYSBtdWRhbsOnYSBjb25zaWRlcmFkYSBuYSBsaW5oYSBkYSBjdXJ2YSBlbnTDo28gcG9kZW1vcyBjb25zaWRlcmFyICoqayA9IDUqKi4KCiogT3JnYW5pemFuZG8gb3MgZGFkb3Mgc2VndW5kbyBvcyBncnVwb3MgaWRlbnRpZmljYWRvczoKYGBge3J9CmQuc2NhbGVkLmttLmxvbmcgPSBrbSAlPiUgCiAgICBhdWdtZW50KGhpc3Rvcmljb190KSAlPiUKICAgIGdhdGhlcihrZXkgPSAidmFyacOhdmVsIiwgCiAgICAgICAgICAgdmFsdWUgPSAidmFsb3IiLCAKICAgICAgICAgICAtdGltZTIsIC0uY2x1c3RlcikKYGBgCgpgYGB7cn0KZC5zY2FsZWQua20ubG9uZyAlPiUgCiAgICBnZ3Bsb3QoYWVzKHg9YHZhcmnDoXZlbGAsIHk9dmFsb3IsIGdyb3VwPXRpbWUyLCBjb2xvdXI9LmNsdXN0ZXIpKSArIAogICAgZ2VvbV9saW5lKGFscGhhID0gLjUpICsgCiAgICBmYWNldF93cmFwKH4gLmNsdXN0ZXIpIApgYGAKCiogVmFtb3MgaWRlbnRpZmljYXIgZXNzZXMgZ3J1cG9zOgogICsgTyBncnVwbyAxIHBvZGVtb3MgdmVyaWZpY2FyIHF1ZSDDqSBjb25zdGl0dWlkbyBwb3IgdGltZXMgb25kZSBvIEJyYXNpbCBtYWlzIGpvZ291IGNvbnRyYS4KICArIE8gZ3J1cG8gMiBvYnNlcnZhbW9zIHF1ZSBvIEJyYXNpbCBqb2dvdSB1bSBuw7ptZXJvIGNvbnNpZGVyYWRvIGRlIHBhcnRpZGFzIGUgdGVtIHVtIG7Dum1lcm8gZGUgdml0b3JpYXMgYmVtIGNvbnRyYSBlc3NlcyB0aW1lcy4KICArIE8gZ3J1cG8gMyBDb25zaWRlcmFtb3MgcXVlIMOpIG8gZ3J1cG8gb25kZSB0ZW0gc2VsZcOnw7VlcyBxdWUgbyBCcmFzaWwgbWFpcyBwZXJkZXUuCiAgKyBPIGdydXBvIDQgb2JzZXJ2YW1vcyBxdWUgw6kgb3MgImZyZWd1ZXNlcyIgZG8gQnJhc2lsIHDDs3Mgw6kgYXMgc2VsZcOnw7VlcyBxdWUgbWFpcyBwZXJkZXJhbSBwYXJhIG8gQnJhc2lsLgo=